Sector Fund

For this project, the sector fund Fidelity Select Technology Portfolio (FSPTX) is chosen as the target fund. On Fidelity’s website it is categorized as Large Growth.

For comparison, Russell 2000 (^RUT), NASDAQ (^IXIC), S&P 500 (^GSPC), S&P 500 IT Sector, S&P North America Tech Sector, S&P MidCap (^MID) and S&P SmallCap (^SML) were selected as the initial indexes to compare against.

For comparison, Vanguard’s similar index funds are also loaded: IT ETF (VGT), LargeCap ETF (VIGAX) and TotalMarket ETF (VTSAX). Among these, VGT stands in for the MSCI US IM Info. Tech. 25/50 index, VIGAX for the CRSP US Large Cap Growth Index, and VTSAX for the CRSP US Total Market Index.

Before loading the data, we define some helper functions to ease the data-cleaning process and declare a few variables first.

## Add daily simple and log returns to a price table, then trim it to a date
## window.
##
## Arguments:
##   Date1     - first date (inclusive) to keep; compared against the Date column.
##   DataFrame - data frame / tibble with a numeric `Close` column and a `Date`
##               column comparable with Date1 (e.g. class Date). Rows are
##               expected in chronological order, oldest first, because each
##               return is taken against the previous ROW.
##   EndDate   - last date (inclusive) to keep. Defaults to 2018-12-31, the
##               cut-off that used to be hard-coded in the body.
##
## Returns the rows of DataFrame inside [Date1, EndDate] with these columns
## added: dailyReturn (simple return), log.Close, log.Return and perc_dailyRe
## (dailyReturn in percent, rounded to 3 decimals).
##
## Returns are computed BEFORE trimming, so the first kept row still has a
## return whenever an earlier row exists in the input.
dailynlogReturn <- function(Date1, DataFrame, EndDate = as.Date("2018-12-31")) {
  stopifnot(all(c("Date", "Close") %in% names(DataFrame)))

  # Value of the previous row (NA for the first row); written with base R so
  # the result does not depend on dplyr::lag masking stats::lag.
  previous <- function(x) c(NA, x)[seq_along(x)]

  price <- DataFrame[["Close"]]
  prev_price <- previous(price)

  # Simple return is measured against the PREVIOUS close, not the current one.
  DataFrame[["dailyReturn"]] <- (price - prev_price) / prev_price
  DataFrame[["log.Close"]] <- log(price)
  DataFrame[["log.Return"]] <-
    DataFrame[["log.Close"]] - previous(DataFrame[["log.Close"]])
  DataFrame[["perc_dailyRe"]] <- round(DataFrame[["dailyReturn"]] * 100.0, 3)

  # which() drops rows whose date comparison is NA instead of keeping NA rows.
  dates <- DataFrame[["Date"]]
  keep <- which(dates >= Date1 & dates <= EndDate)
  trimmed <- DataFrame[keep, , drop = FALSE]
  if (!inherits(trimmed, "tbl_df")) {
    # plain data frames: renumber rows 1..n after subsetting
    rownames(trimmed) <- NULL
  }
  trimmed
}
## Project the value of a 10,000 investment made on the earliest date in DF,
## assuming the position simply tracks Close (i.e. all earnings stay invested).
##
## Arguments:
##   DF - data frame / tibble with a `Date` column and a numeric `Close` column.
##
## Returns DF with an added column ProjValper10k = Close * 10000 / P0, where P0
## is the Close on the earliest Date. If the earliest date occurs more than
## once, the first such row is the baseline, so P0 is always a single number
## and is never recycled across rows.
getProjectionValue <- function(DF) {
  stopifnot(all(c("Date", "Close") %in% names(DF)))

  # order() is stable, so ties on the earliest date resolve to the first row.
  baseline_row <- order(DF[["Date"]])[1L]
  P0 <- DF[["Close"]][baseline_row]

  DF[["ProjValper10k"]] <- (DF[["Close"]] * 10000) / P0
  DF
}
## Root-mean-square difference between two numeric vectors:
##   sqrt(sum((x - y)^2) / length(y))
## This is the Euclidean distance between x and y divided by sqrt(length(y)).
##
## Arguments:
##   x, y - numeric vectors; they are subtracted element-wise, so they should
##          have the same length (R recycles the shorter one otherwise).
## Returns a single number.
sqerr <- function(x, y) {
  z <- x - y
  # sum(z * z) is the dot product of z with itself, computed with base R only.
  sqrt(sum(z * z) / length(y))
}

## Standardize the NAV: adds column Close.z, the z-score of Close
## (Close minus its mean, divided by its standard deviation).
standardizedNAV <- function(DF) {
  with_z <- mutate(DF, Close.z = (Close - mean(Close)) / sd(Close))
  with_z
}

## Start of the study window: only data dated on or after 2014-01-01 is analysed
StartDate <- as.Date("2014-01-01", format = "%Y-%m-%d")

Load and clean the data.

Load the data:

# Read each price file, add the return columns and keep the rows from
# StartDate onwards (see dailynlogReturn).

# Target fund
FSPTX <- dailynlogReturn(Date1 = StartDate, DataFrame = read_csv("FSPTX.csv"))

# Market indexes
NASDAQ <- dailynlogReturn(Date1 = StartDate, DataFrame = read_csv("^IXIC.csv"))
SnP500 <- dailynlogReturn(Date1 = StartDate, DataFrame = read_csv("^GSPC.csv"))
SnPMID <- dailynlogReturn(Date1 = StartDate, DataFrame = read_csv("^MID.csv"))
SnPSML <- dailynlogReturn(Date1 = StartDate, DataFrame = read_csv("^SML.csv"))
RUSSELL2000 <- dailynlogReturn(Date1 = StartDate, DataFrame = read_csv("^RUT.csv"))

# Vanguard funds
VGT <- dailynlogReturn(Date1 = StartDate, DataFrame = read_csv("VGT.csv"))
VIGAX <- dailynlogReturn(Date1 = StartDate, DataFrame = read_csv("VIGAX.csv"))
VTSAX <- dailynlogReturn(Date1 = StartDate, DataFrame = read_csv("VTSAX.csv"))

# Technology sector indexes
SnP500Info <- dailynlogReturn(Date1 = StartDate, DataFrame = read_csv("SnP500Info.csv"))
SnPNATech <- dailynlogReturn(Date1 = StartDate, DataFrame = read_csv("SnPNATECH_clean.csv"))

Pick the dates on which tax and dividend events happen

Check for daily-return anomalies and remove those dates from every series

## Compare daily returns of FSPTX against the benchmark series.
## Rows are matched by POSITION, so every series must cover the same dates in
## the same order. Building the table with data.frame() (instead of cbind())
## keeps Date as a Date and makes a length mismatch an error rather than a
## silent recycle.
##
## Added columns:
##   vs<X>    - FSPTX / X, or FSPTX itself when X^2 <= epsilon (avoids
##              dividing by a (near-)zero benchmark return)
##   minus<X> - FSPTX - X
epsilon <- 0.000000000000000001
dailyReturnComp <- data.frame(
  Date = as.Date(FSPTX$Date),
  FSPTX = FSPTX$dailyReturn,
  NASDAQ = NASDAQ$dailyReturn,
  SnP500 = SnP500$dailyReturn,
  VGT = VGT$dailyReturn,
  VIGAX = VIGAX$dailyReturn,
  VTSAX = VTSAX$dailyReturn
) %>%
  mutate(
    vsNASDAQ = ifelse(NASDAQ * NASDAQ <= epsilon, FSPTX, FSPTX / NASDAQ),
    vsSnP500 = ifelse(SnP500 * SnP500 <= epsilon, FSPTX, FSPTX / SnP500),
    vsVGT = ifelse(VGT * VGT <= epsilon, FSPTX, FSPTX / VGT),
    vsVIGAX = ifelse(VIGAX * VIGAX <= epsilon, FSPTX, FSPTX / VIGAX),
    vsVTSAX = ifelse(VTSAX * VTSAX <= epsilon, FSPTX, FSPTX / VTSAX)
  ) %>%
  mutate(
    minusNASDAQ = FSPTX - NASDAQ,
    minusSnP500 = FSPTX - SnP500,
    minusVGT = FSPTX - VGT,
    minusVIGAX = FSPTX - VIGAX,
    minusVTSAX = FSPTX - VTSAX
  )

# Scatter of the FSPTX-minus-NASDAQ daily return over time with a loess trend.
plotly::plotly_build(
  ggplot(dailyReturnComp, aes(x = Date, y = minusNASDAQ)) +
    geom_point(alpha = .1) +
    geom_smooth(method = "loess", se = TRUE)
)
# Dates on which FSPTX trails NASDAQ by more than 0.025 in daily return.
DividenDates <- dplyr::pull(
  dplyr::filter(dailyReturnComp, minusNASDAQ < -0.025),
  Date
)



# Drop the flagged dates from every series so the rows stay aligned by position.
FSPTX <- dplyr::filter(FSPTX, !(Date %in% DividenDates))
NASDAQ <- dplyr::filter(NASDAQ, !(Date %in% DividenDates))
SnP500 <- dplyr::filter(SnP500, !(Date %in% DividenDates))
SnPMID <- dplyr::filter(SnPMID, !(Date %in% DividenDates))
SnPSML <- dplyr::filter(SnPSML, !(Date %in% DividenDates))
RUSSELL2000 <- dplyr::filter(RUSSELL2000, !(Date %in% DividenDates))
VGT <- dplyr::filter(VGT, !(Date %in% DividenDates))
VIGAX <- dplyr::filter(VIGAX, !(Date %in% DividenDates))
VTSAX <- dplyr::filter(VTSAX, !(Date %in% DividenDates))
SnP500Info <- dplyr::filter(SnP500Info, !(Date %in% DividenDates))
SnPNATech <- dplyr::filter(SnPNATech, !(Date %in% DividenDates))


# Log returns recomputed from the `Adj Close` column, after the flagged dates
# have been removed.
NASDAQ1 <- dplyr::mutate(
  NASDAQ,
  log.Close.adj = log(`Adj Close`),
  log.Return.adj = log.Close.adj - lag(log.Close.adj)
)

FSPTX1 <- dplyr::mutate(
  FSPTX,
  log.Close.adj = log(`Adj Close`),
  log.Return.adj = log.Close.adj - lag(log.Close.adj)
)

# Difference of adjusted log returns (FSPTX minus NASDAQ), plotted over time.
ytest <- FSPTX1$log.Return.adj - NASDAQ1$log.Return.adj
plotly::plotly_build(
  ggplot(FSPTX1, aes(x = Date, y = ytest)) +
    geom_point(alpha = .1) +
    geom_smooth(method = "loess", se = TRUE)
)
## Warning: Removed 1 rows containing non-finite values (stat_smooth).

## Correlation map of daily log returns: one column per series.
## data.frame() rewrites the labels containing parentheses into syntactic
## column names.
DailyReturncor <- data.frame(cbind(
  "FSPTX" = FSPTX$log.Return,
  "NASDAQ" = NASDAQ$log.Return,
  "RUSSELL2000" = RUSSELL2000$log.Return,
  "SnP500" = SnP500$log.Return,
  "SnP500Info" = SnP500Info$log.Return,
  "SnPNATech" = SnPNATech$log.Return,
  "SnPMID" = SnPMID$log.Return,
  "SnPSML" = SnPSML$log.Return,
  "VGT(IT ETF)" = VGT$log.Return,
  "VIGAX(LargeCAP)" = VIGAX$log.Return,
  "VTSAX(TotalMarket)" = VTSAX$log.Return
))

#print("DailyReturn Correlation")
#cor(DailyReturncor,DailyReturncor)

# Red-white-blue palette for the correlation plot.
colmat <- colorRampPalette(c("red", "white", "blue"))
corrplot::corrplot(
  cor(DailyReturncor, DailyReturncor),
  cl.lim = c(0.6, 1.0),
  is.corr = FALSE,
  col = colmat(100),
  title = "Daily Log Return cor",
  type = "lower",
  tl.cex = .8,
  mar = c(1, 1, 2, 1)
)

# Keep the correlation matrix as a data frame and show its first column
# (the correlations with FSPTX).
logreturncor <- data.frame(cor(DailyReturncor, DailyReturncor))
dplyr::select(logreturncor, 1)

The best matches according to daily log return are SnPNATech, SnP500Info and NASDAQ. In addition, VGT.IT.ETF is also highly correlated in terms of daily log return.

Try linear regression on log returns

# Full model: FSPTX log return on the four best-correlated series.
logreturnmodel <- lm(
  FSPTX$log.Return ~
    NASDAQ$log.Return + SnP500Info$log.Return +
    SnPNATech$log.Return + VGT$log.Return
)

summary(logreturnmodel)
## 
## Call:
## lm(formula = FSPTX$log.Return ~ NASDAQ$log.Return + SnP500Info$log.Return + 
##     SnPNATech$log.Return + VGT$log.Return)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.018660 -0.001623  0.000001  0.001659  0.011570 
## 
## Coefficients:
##                         Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           -3.864e-05  8.019e-05  -0.482     0.63    
## NASDAQ$log.Return      2.319e-01  3.745e-02   6.192 8.06e-10 ***
## SnP500Info$log.Return -6.129e-01  6.707e-02  -9.138  < 2e-16 ***
## SnPNATech$log.Return   7.508e-01  6.237e-02  12.038  < 2e-16 ***
## VGT$log.Return         6.603e-01  7.666e-02   8.614  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.002829 on 1246 degrees of freedom
## Multiple R-squared:  0.9377, Adjusted R-squared:  0.9375 
## F-statistic:  4686 on 4 and 1246 DF,  p-value: < 2.2e-16
#NASDAQ$log.Return + SnP500Info$log.Return + SnPNATech$log.Return+VGT$log.Return -

  
# Standardized residuals vs fitted values, with dashed guides at +/- 2.
plotly::plotly_build(
  ggplot(logreturnmodel, aes(x = .fitted, y = .stdresid)) +
    geom_point() +
    geom_abline(intercept = 2.0, slope = 0.0, linetype = "dashed") +
    geom_abline(intercept = -2.0, slope = 0.0, linetype = "dashed") +
    labs(
      title = "Standardized Residual Plot",
      x = "Fitted Values",
      y = "Standardized Residuals"
    )
)
# Base-graphics alternative: plot(logreturnmodel, which = 2)

# Normal QQ plot of the standardized residuals.
plotly::plotly_build(
  ggplot(logreturnmodel, aes(sample = .stdresid)) +
    stat_qq() +
    stat_qq_line(linetype = "dashed") +
    labs(
      title = "QQnorm Plot",
      x = "Theoretical",
      y = "Standardized Residuals"
    )
)
# Reduced model 1: NASDAQ dropped from the predictors.
logreturnmodel1 <- lm(
  FSPTX$log.Return ~
    SnP500Info$log.Return + SnPNATech$log.Return + VGT$log.Return
)

summary(logreturnmodel1)
## 
## Call:
## lm(formula = FSPTX$log.Return ~ SnP500Info$log.Return + SnPNATech$log.Return + 
##     VGT$log.Return)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.019122 -0.001642  0.000046  0.001763  0.011625 
## 
## Coefficients:
##                         Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           -6.182e-05  8.129e-05   -0.76    0.447    
## SnP500Info$log.Return -7.129e-01  6.606e-02  -10.79   <2e-16 ***
## SnPNATech$log.Return   9.395e-01  5.522e-02   17.01   <2e-16 ***
## VGT$log.Return         7.804e-01  7.527e-02   10.37   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.002871 on 1247 degrees of freedom
## Multiple R-squared:  0.9357, Adjusted R-squared:  0.9356 
## F-statistic:  6054 on 3 and 1247 DF,  p-value: < 2.2e-16
#NASDAQ$log.Return + SnP500Info$log.Return + SnPNATech$log.Return+VGT$log.Return -

  
# Standardized residuals vs fitted values, with dashed guides at +/- 2.
plotly::plotly_build(
  ggplot(logreturnmodel1, aes(x = .fitted, y = .stdresid)) +
    geom_point() +
    geom_abline(intercept = 2.0, slope = 0.0, linetype = "dashed") +
    geom_abline(intercept = -2.0, slope = 0.0, linetype = "dashed") +
    labs(
      title = "Standardized Residual Plot",
      x = "Fitted Values",
      y = "Standardized Residuals"
    )
)
# Base-graphics alternative: plot(logreturnmodel, which = 2)

# Normal QQ plot of the standardized residuals.
plotly::plotly_build(
  ggplot(logreturnmodel1, aes(sample = .stdresid)) +
    stat_qq() +
    stat_qq_line(linetype = "dashed") +
    labs(
      title = "QQnorm Plot",
      x = "Theoretical",
      y = "Standardized Residuals"
    )
)
# Reduced model 2: only SnPNATech and VGT as predictors.
logreturnmodel2 <- lm(
  FSPTX$log.Return ~ SnPNATech$log.Return + VGT$log.Return
)

summary(logreturnmodel2)
## 
## Call:
## lm(formula = FSPTX$log.Return ~ SnPNATech$log.Return + VGT$log.Return)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -0.0195730 -0.0016780  0.0001288  0.0017423  0.0149358 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          -5.475e-05  8.496e-05  -0.644 0.519470    
## SnPNATech$log.Return  7.834e-01  5.571e-02  14.062  < 2e-16 ***
## VGT$log.Return        2.206e-01  5.700e-02   3.870 0.000115 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.003001 on 1248 degrees of freedom
## Multiple R-squared:  0.9297, Adjusted R-squared:  0.9296 
## F-statistic:  8258 on 2 and 1248 DF,  p-value: < 2.2e-16
#NASDAQ$log.Return + SnP500Info$log.Return + SnPNATech$log.Return+VGT$log.Return -

  
# Standardized residuals vs fitted values, with dashed guides at +/- 2.
plotly::plotly_build(
  ggplot(logreturnmodel2, aes(x = .fitted, y = .stdresid)) +
    geom_point() +
    geom_abline(intercept = 2.0, slope = 0.0, linetype = "dashed") +
    geom_abline(intercept = -2.0, slope = 0.0, linetype = "dashed") +
    labs(
      title = "Standardized Residual Plot",
      x = "Fitted Values",
      y = "Standardized Residuals"
    )
)
# Base-graphics alternative: plot(logreturnmodel, which = 2)

# Normal QQ plot of the standardized residuals.
plotly::plotly_build(
  ggplot(logreturnmodel2, aes(sample = .stdresid)) +
    stat_qq() +
    stat_qq_line(linetype = "dashed") +
    labs(
      title = "QQnorm Plot",
      x = "Theoretical",
      y = "Standardized Residuals"
    )
)
# Single-predictor model: SnPNATech only.
# The leading unary `+` in the formula is a no-op; it is kept so the printed
# Call matches the recorded output.
logreturnmodel3 <- lm(
  FSPTX$log.Return ~ +SnPNATech$log.Return
)

summary(logreturnmodel3)
## 
## Call:
## lm(formula = FSPTX$log.Return ~ +SnPNATech$log.Return)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -0.0195046 -0.0017508  0.0001291  0.0017999  0.0136216 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          -5.645e-05  8.544e-05  -0.661    0.509    
## SnPNATech$log.Return  9.968e-01  7.803e-03 127.747   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.003018 on 1249 degrees of freedom
## Multiple R-squared:  0.9289, Adjusted R-squared:  0.9288 
## F-statistic: 1.632e+04 on 1 and 1249 DF,  p-value: < 2.2e-16
#NASDAQ$log.Return + SnP500Info$log.Return + SnPNATech$log.Return+VGT$log.Return -

  
# Standardized residuals vs fitted values, with dashed guides at +/- 2.
plotly::plotly_build(
  ggplot(logreturnmodel3, aes(x = .fitted, y = .stdresid)) +
    geom_point() +
    geom_abline(intercept = 2.0, slope = 0.0, linetype = "dashed") +
    geom_abline(intercept = -2.0, slope = 0.0, linetype = "dashed") +
    labs(
      title = "Standardized Residual Plot",
      x = "Fitted Values",
      y = "Standardized Residuals"
    )
)
# Base-graphics alternative: plot(logreturnmodel, which = 2)

# Normal QQ plot of the standardized residuals.
plotly::plotly_build(
  ggplot(logreturnmodel3, aes(sample = .stdresid)) +
    stat_qq() +
    stat_qq_line(linetype = "dashed") +
    labs(
      title = "QQnorm Plot",
      x = "Theoretical",
      y = "Standardized Residuals"
    )
)
# Single-predictor model: VGT only.
logreturnmodel4 <- lm(
  FSPTX$log.Return ~ VGT$log.Return
)

summary(logreturnmodel4)
## 
## Call:
## lm(formula = FSPTX$log.Return ~ VGT$log.Return)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -0.0197735 -0.0017167  0.0000787  0.0018789  0.0196980 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    -4.031e-05  9.140e-05  -0.441    0.659    
## VGT$log.Return  1.014e+00  8.542e-03 118.736   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.003229 on 1249 degrees of freedom
## Multiple R-squared:  0.9186, Adjusted R-squared:  0.9186 
## F-statistic: 1.41e+04 on 1 and 1249 DF,  p-value: < 2.2e-16
#NASDAQ$log.Return + SnP500Info$log.Return + SnPNATech$log.Return+VGT$log.Return -

  
# Standardized residuals vs fitted values, with dashed guides at +/- 2.
plotly::plotly_build(
  ggplot(logreturnmodel4, aes(x = .fitted, y = .stdresid)) +
    geom_point() +
    geom_abline(intercept = 2.0, slope = 0.0, linetype = "dashed") +
    geom_abline(intercept = -2.0, slope = 0.0, linetype = "dashed") +
    labs(
      title = "Standardized Residual Plot",
      x = "Fitted Values",
      y = "Standardized Residuals"
    )
)
# Base-graphics alternative: plot(logreturnmodel, which = 2)

# Normal QQ plot of the standardized residuals.
plotly::plotly_build(
  ggplot(logreturnmodel4, aes(sample = .stdresid)) +
    stat_qq() +
    stat_qq_line(linetype = "dashed") +
    labs(
      title = "QQnorm Plot",
      x = "Theoretical",
      y = "Standardized Residuals"
    )
)

Save the fitted values of the full model as the composite index

# Table of the observed FSPTX log return next to the fitted value of the full
# model (the "composit" index), one row per date.
# Built directly with data.frame(): the numbers are never round-tripped
# through character strings (which would truncate them), and a fitted-value
# vector of the wrong length raises an error instead of being recycled.
regressiontable <- data.frame(
  Date = as.Date(FSPTX$Date),
  FSPTX = as.numeric(FSPTX$log.Return),
  composit = as.numeric(logreturnmodel$fitted.values)
)

# direction is 1 when FSPTX and the composit have the same sign (their
# product is positive) and 0 otherwise.
regressiontable <- mutate(
  regressiontable,
  direction = if_else(FSPTX * composit > 0, 1, 0)
)

ggplot(regressiontable, aes(x = composit, y = direction)) +
  geom_point()

# Logistic regression of the same-direction flag on the composit value.
directionmodel <- glm(
  direction ~ composit,
  family = binomial(link = "logit"),
  data = regressiontable
)
summary(directionmodel)
## 
## Call:
## glm(formula = direction ~ composit, family = binomial(link = "logit"), 
##     data = regressiontable)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.1763   0.4424   0.4473   0.4524   0.4878  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  2.24226    0.09602   23.35   <2e-16 ***
## composit     3.47485    8.69147    0.40    0.689    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 790.71  on 1250  degrees of freedom
## Residual deviance: 790.55  on 1249  degrees of freedom
## AIC: 794.55
## 
## Number of Fisher Scoring iterations: 5
# ROC curve (true-positive rate vs false-positive rate) of the direction model,
# with the diagonal drawn as a dashed reference line.
fitted.y <- fitted(directionmodel)
observed.y <- regressiontable$direction
perf <- ROCR::performance(
  ROCR::prediction(fitted.y, observed.y),
  measure = "tpr",
  x.measure = "fpr"
)
ROCR::plot(perf)
abline(a = 0, b = 1, lty = 2)

# Binned residuals against the model's predictions.
binnedplot(x = predict(directionmodel), y = resid(directionmodel))

# Number of days per direction value (1 = same sign as the composit, 0 = not).
directionvscomposit <- regressiontable %>%
  dplyr::group_by(direction) %>%
  dplyr::summarise(counts = n())
print("average percentage of fund moving with composit")
## [1] "average percentage of fund moving with composit"
# Share of days on which the fund moved with the composit. The direction == 1
# group is selected by value rather than by row position, so the result stays
# correct (0) even when that group is absent.
sum(directionvscomposit$counts[directionvscomposit$direction %in% 1]) /
  sum(directionvscomposit$counts)
##    counts 
## 0.9040767
print("correlation of composit index")
## [1] "correlation of composit index"
# Correlation between the direction model's predictions and the composit
# values; composit is the model's only predictor.
cor(x = predict(directionmodel), y = regressiontable$composit)
## [1] 1
# Distribution of (FSPTX - composit) on days when both moved the same way.
plotly::plotly_build(
  ggplot(
    dplyr::filter(regressiontable, direction == 1),
    aes(x = FSPTX - composit)
  ) +
    geom_histogram(bins = 30, aes(y = ..density..), alpha = .5) +
    geom_density() +
    xlab("log return of FSPTX - log return of composit, when same direction")
)
# Same distribution on days when they moved in different directions.
plotly::plotly_build(
  ggplot(
    dplyr::filter(regressiontable, direction == 0),
    aes(x = FSPTX - composit)
  ) +
    geom_histogram(bins = 30, aes(y = ..density..), alpha = .5) +
    geom_density() +
    xlab("log return of FSPTX - log return of composit, when different direction")
)